House Prices in Seattle

Using Plotly & Bokeh for Visualisation and Keras for Predicting

!!! MAPBOX TOKEN & GOOGLE API KEYS NOT N CODE BELOW !!!

Search for variables mapbox_access_token and GOOGLE_API_KEY below, uncomment their lines and input your keys. These keys are free to attain.

Imports

General

In [1]:
import pandas as pd
import numpy as np
import os
from math import pow
from math import log
import random as rd
from scipy.stats.kde import gaussian_kde
import colorcet as cc
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

Plotly

In [2]:
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML
import plotly.graph_objs as go

Bokeh

In [38]:
from bokeh import plotting as bplt
from bokeh.plotting import gmap #, figure
from bokeh.io import output_notebook, show, output_file, push_notebook
from bokeh.models import NumeralTickFormatter, ColumnDataSource, HoverTool, CustomJS, Slider, GMapOptions, Toggle, LinearAxis, Range1d
from bokeh.models.callbacks import CustomJS
from bokeh.layouts import row, column
from bokeh.palettes import Category20_16, Category20_20
from bokeh.models.widgets import CheckboxGroup, Slider, Dropdown

Keras

In [92]:
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Activation, Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
C:\Users\Luke\Anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning:

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.

Exploring the Data

Download and Describe

In [4]:
df=pd.read_csv('kc_house_data.csv')
df.head()
df.isnull().values.any()
df.isna().values.any()
df.describe()
Out[4]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656873 1788.390691 291.509045 1971.005136 84.402258 98077.939805 47.560053 -122.213896 1986.552492 12768.455652
std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175459 828.090978 442.575043 29.373411 401.679240 53.505026 0.138564 0.140828 685.391304 27304.179631
min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000000 290.000000 0.000000 1900.000000 0.000000 98001.000000 47.155900 -122.519000 399.000000 651.000000
25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000000 1190.000000 0.000000 1951.000000 0.000000 98033.000000 47.471000 -122.328000 1490.000000 5100.000000
50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000000 1560.000000 0.000000 1975.000000 0.000000 98065.000000 47.571800 -122.230000 1840.000000 7620.000000
75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000000 2210.000000 560.000000 1997.000000 0.000000 98118.000000 47.678000 -122.125000 2360.000000 10083.000000
max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 871200.000000

Print Ranges of House Prices

In [5]:
for i in range(10):
    a = df['price'][df['price']<100000*i].shape[0]
    b = df['price'][df['price']<100000*(i+1)].shape[0]
    print("Houses Sold for %s-%sk: %s" %(i*100,(i+1)*100,b-a))

for i in range(1,8):
    a = df['price'][df['price']<1000000*i].shape[0]
    b = df['price'][df['price']<1000000*(i+1)].shape[0]
    print("Houses Sold for %s-%sm: %s" %(i,i+1,b-a))
Houses Sold for 0-100k: 25
Houses Sold for 100-200k: 761
Houses Sold for 200-300k: 3651
Houses Sold for 300-400k: 4257
Houses Sold for 400-500k: 3714
Houses Sold for 500-600k: 2839
Houses Sold for 600-700k: 1954
Houses Sold for 700-800k: 1410
Houses Sold for 800-900k: 927
Houses Sold for 900-1000k: 583
Houses Sold for 1-2m: 1287
Houses Sold for 2-3m: 155
Houses Sold for 3-4m: 38
Houses Sold for 4-5m: 5
Houses Sold for 5-6m: 4
Houses Sold for 6-7m: 1
Houses Sold for 7-8m: 2

Print Ranges of Lot Area

In [6]:
df['sqft_lot15']
for i in range(10):
    a = df['sqft_lot15'][df['sqft_lot15']<1000*i].shape[0]
    b = df['sqft_lot15'][df['sqft_lot15']<1000*(i+1)].shape[0]
    print("Houses with %s-%sk square feet: %s" %(i,(i+1),b-a))

i=1
a = df['sqft_lot15'][df['sqft_lot15']<10000*i].shape[0]
b = df['sqft_lot15'][df['sqft_lot15']<10000*(i+1)].shape[0]
print("Houses with %s-%sk square feet: %s" %(i*10,(i+1)*10,b-a))

print("Houses with >20k square feet: %s" %(df['sqft_lot15'][df['sqft_lot15']>=20000].shape[0]))
Houses with 0-1k square feet: 57
Houses with 1-2k square feet: 1024
Houses with 2-3k square feet: 455
Houses with 3-4k square feet: 1086
Houses with 4-5k square feet: 2100
Houses with 5-6k square feet: 2395
Houses with 6-7k square feet: 1870
Houses with 7-8k square feet: 2877
Houses with 8-9k square feet: 2449
Houses with 9-10k square feet: 1705
Houses with 10-20k square feet: 3662
Houses with >20k square feet: 1933

Print Ranges of Living Space

In [7]:
print("Houses with less than 1k square feet of living space: %s" %(df['sqft_living15'][df['sqft_living15']<1000].shape[0]))
for i in range(0,8):
    a = df['sqft_lot'][df['sqft_living15']<250*i+1000].shape[0]
    b = df['sqft_lot'][df['sqft_living15']<250*(i+1)+1000].shape[0]
    print("Houses with %s-%s square feet of living space: %s" %(250*i+1000,(i+1)*250+1000,b-a))
print("Houses with more than 3k square feet of living space: %s" %(df['sqft_living15'][df['sqft_living15']>=3000].shape[0]))
Houses with less than 1k square feet of living space: 348
Houses with 1000-1250 square feet of living space: 1684
Houses with 1250-1500 square feet of living space: 3505
Houses with 1500-1750 square feet of living space: 3936
Houses with 1750-2000 square feet of living space: 3242
Houses with 2000-2250 square feet of living space: 2529
Houses with 2250-2500 square feet of living space: 1927
Houses with 2500-2750 square feet of living space: 1483
Houses with 2750-3000 square feet of living space: 1057
Houses with more than 3k square feet of living space: 1902

Get Price Names

In [8]:
k = round(df['price']/1000,0).astype(int)
df['price_names'] = '$' + k.astype(str) + 'k'
for i in range(df['price_names'].shape[0]):
    if len(df['price_names'][i])>5:
        num = round(int(df['price_names'][i][1:(len(df['price_names'][i])-1)])/1000,1)
        df.loc[i,'price_names'] = '$' + str(num) + 'm'

Get Months

In [9]:
months = ['May 14','Jun 14','Jul 14','Aug 14','Sep 14','Oct 14','Nov 14','Dec 14','Jan 15','Feb 15','Mar 15','Apr 15','May 15']
months_in_date_form = ['201405','201406','201407','201408','201409','201410','201411','201412','201501','201502','201503','201504','201505']
date = dict(zip(months_in_date_form,months))
df['month'] = df['date']
for i in range(df.date.shape[0]):
    df.loc[i,'month'] = date[df.loc[i,'date'][:6]]

Get Year Built or Renovated, whichever is more recent

In [10]:
df['max_yr_built_or_renovated'] = df['yr_built']
index = df[df['yr_renovated']>0].index.values
for i in index:
    df.loc[i,'max_yr_built_or_renovated'] = max(df.loc[i,'yr_built'],df.loc[i,'yr_renovated'])

Visualising the Data

Plotly

In [11]:
init_notebook_mode(connected=True)

House Prices on Map

In [12]:
bins = [0,300000,500000,700000,8000000]
df['binned'] = pd.cut(df['price'], bins).astype(str).tolist()
subset = df[['lat','long','price','binned','price_names','month']]
subs={}
for t in range(df['binned'].unique().shape[0]):
    subs["s"+str(t)+""]=subset[subset['binned']==np.sort(df['binned'].unique())[t]]
In [13]:
#mapbox_access_token = '*******************************************************************************************'
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
labels = ["< $0.3m","$0.3-0.5m","$0.5-0.7m","> $0.7m" ]
traces = []
for i in range(4):
    x = subs['s'+str(i)]
    col = colours[i]
    traces.append(go.Scattermapbox(
        lat=x['lat'].astype(str).tolist(),
        lon=x['long'].astype(str).tolist(),
        mode='markers',
        marker = dict(
            size = 4,
            color = col,
        ),
        text=x['price_names'].tolist(),
        hoverinfo = 'text',
        name=labels[i]
))

data = traces
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    title="2014-15 Sold House Prices in King County",
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=np.median(df['lat']),
            lon=np.median(df['long'])
        ),
        pitch=0,
        zoom=9
    ),
)

fig = dict(data=data, layout=layout)

iplot(fig, filename='Seattle Prices by Colour')

Hover mouse over location for price tag and zoom in for further inspection.

Animation and Interactivity

In [14]:
value=0
traces = []
for i in range(4):
    x = subs['s'+str(i)][subs['s'+str(i)]['month']==months[value]]
    col = colours[i]
    traces.append(go.Scattermapbox(
        lat=x['lat'].astype(str).tolist(),
        lon=x['long'].astype(str).tolist(),
        mode='markers',
        marker = dict(
            size = 7,
            color = col,
        ),
        text=x['price_names'].tolist(),
        hoverinfo = 'text',
        name=labels[i]
    ))
data = traces
    
sliders_dict = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Month:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}

layout = go.Layout(
    autosize=True,
    height = 500,
    margin = {'l': 20, 'b': 30, 'r':10, 't': 100}, 
    hovermode='closest',
    title="Sold House Prices in King County",
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=np.median(df['lat']),
            lon=np.median(df['long'])
        ),
        pitch=0,
        zoom=9
    ),
    sliders = [sliders_dict
#         args = ['transition', dict(duration=400,easing='cubic-in-out')],
#         initialValue='May 14',
#         plotlycommand='animate',
#         values = months,
#         visible=True
    ],
    updatemenus = [dict(
        buttons = [
            dict(
                args= [None,dict(frame=dict(duration=500,redraw=False),fromcurrent=True, 
                                 transition=dict(duration=300,easing='quadratic-in-out'))],
                label='Play',
                method='animate'
            ),
            dict(
                args= [None,dict(frame=dict(duration=0,redraw=False),mode='immediate', 
                                 transition=dict(duration=0))],
                label='Pause',
                method='animate'
            )],
        direction = 'left',
        pad = {'r': 10, 't': 87},
        showactive = False,
        type = 'buttons',
        x = 0.1,
        xanchor = 'right',
        y = 0,
        yanchor = 'top'
    )]
)
figure = dict(data=data, layout=layout)

figure['frames'] = []
    
# make frames
for value in range(len(months)):
    frame = {'data': [], 'name': months[value]}
    for i in range(4):
        x = subs['s'+str(i)][subs['s'+str(i)]['month']==months[value]]
        col = colours[i]
        frame['data'].append(go.Scattermapbox(
            lat=x['lat'].astype(str).tolist(),
            lon=x['long'].astype(str).tolist(),
            mode='markers',
            marker = dict(
                size = 7,
                color = col,
            ),
            text=x['price_names'].tolist(),
            hoverinfo = 'text',
            name=labels[i]
        ))
    figure['frames'].append(frame)
    slider_step = {'args': [
        [months[value]],
        {'frame': {'duration': 300, 'redraw': False},
         'mode': 'immediate',
       'transition': {'duration': 300}}
     ],
     'label': months[value],
     'method': 'animate'}
    sliders_dict['steps'].append(slider_step)

    
figure['layout']['sliders'] = [sliders_dict]

iplot(figure)

Play through months to see how prices change in various locations over time. Or select a month from the slider.

Coloured Scatterplot

In [15]:
dataPoints = go.Scattergl(
    x=df['sqft_living15'],
    y=np.log(df['price']),
    mode='markers',
    marker=dict(
        opacity=0.75,
        color=df.grade,
        showscale=True,
        colorscale='Jet',
        colorbar=dict(
            title='Grade'
        ),
    ),
    name='Data points'
)

data=[dataPoints]

layout = go.Layout(
    title='Log Price vs Living Space',
    xaxis=dict(
        title='Living Space (Square Feet)'
    ),
    yaxis=dict(
        title='Log(House Price)'
    ),
    #showlegend=True
)

figure = go.Figure(data=data, layout=layout)

iplot(figure)
In [16]:
temp_df = df[df['price']<1000000]

data = [
    {
        'x': temp_df['max_yr_built_or_renovated'],
        'y': temp_df['sqft_living15'],
        'mode': 'markers',
        'marker': {
            'color': temp_df['price'],
            'showscale': True,
            'colorbar': {'title':'Price'}
        },
        'name':'Price'
    }
]

layout = go.Layout(
    title = "Price (Capped at $1m) vs Square Feet of Living Space and Year Built or Renovated",
    yaxis=dict(title='Living Space (Square Feet)'),
    xaxis=dict(title='Year Built or Renovated')
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter-colorscale')

Multi-Axis Line Plot

In [17]:
monthly_median_prices = []
monthly_volume = []
for m in range(len(months)):
    temp_df = df[df['month']==months[m]]
    monthly_median_prices.append(int(np.median(temp_df['price'])))
    monthly_volume.append(temp_df.shape[0])
In [18]:
trace1 = go.Scatter(
    x=months,
    y=monthly_median_prices,
    name='Median Price'
)
trace2 = go.Scatter(
    x=months,
    y=monthly_volume,
    name='Sales',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='2014-15 King County Real Estate Market Activity',
    yaxis=dict(
        title='Median Price'
    ),
    yaxis2=dict(
        title='Sales',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='multiple-axes-double')

Prices seem to track volume. A greater amount of houses sold indicates greater demand, and hence inflated prices.

In [19]:
bounds = [0]+[1000]+[1000+i*250 for i in range(1,9)] + [10000]
df['sqft_liv15_binned'] = pd.cut(df['sqft_living15'], bounds).astype(str).tolist()
bins = np.sort(df['sqft_liv15_binned'].unique())
median_prices = []
median_bedrooms = []
mid_bounds = []
for j in range(len(bounds)-1):
    median_prices.append(np.median(df['price'][df['sqft_liv15_binned']==bins[j]]))
    median_bedrooms.append(np.median(df['bedrooms'][df['sqft_liv15_binned']==bins[j]]))
    mid_bounds.append(round((bounds[j]+bounds[j+1])/2,0))
In [20]:
trace1 = go.Scatter(
    x=bounds,
    y=median_prices,
    name='Median Price'
)
trace2 = go.Scatter(
    x=bounds,
    y=median_bedrooms,
    name='Median Bedroom',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='Living Space vs Price & Bedrooms ',
    xaxis=dict(title='Living Space (Square Feet)'),
    yaxis=dict(
        title='Median Price'
    ),
    yaxis2=dict(
        title='Median Bedrooms',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='binned_sqft')

Violin Plot

In [21]:
variable = 'view'

def violin_plot(variable):
    temp_df = df[df['price']<1000000]
    data = []
    for i in range(0,len(pd.unique(df[variable]))):
        trace = {
            "type": 'violin',
            "x": temp_df[variable][temp_df[variable] == np.sort(temp_df[variable].unique())[i]],
            "y": temp_df['price'][temp_df[variable] == np.sort(temp_df[variable].unique())[i]],
            "name": np.sort(temp_df[variable].unique())[i],
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            }
        }
        data.append(trace)

    fig = {
        "data": data,
        "layout" : {
            "title": "Violin Plot for Price (Capped at $1m) vs " + variable.title(),
            "yaxis": {
                "zeroline": False,
                "title": "Price ($)"
            },
            "xaxis": {
                "title": variable.title()
            }
        }
    }
    return fig
In [22]:
iplot(violin_plot('view'), filename='violin', validate = False)

Radar Plot

In [23]:
# price, bedrooms, floors, yr_built
wf = []
not_wf = []
wf_df = df[df['waterfront']==1]
not_wf_df = df[df['waterfront']==0]
variables = ['price','bedrooms','grade','sqft_lot15','sqft_living15','condition']
for i in variables:
    temp_val = np.median(df[i])
    wf.append(round(np.median(wf_df[i])/temp_val,2))
    not_wf.append(round(np.median(not_wf_df[i])/temp_val,2))
In [24]:
trace1 = go.Scatterpolar(
    r = wf,
    theta = variables,
    fill = 'toself',
    name = "Waterfront"
)

trace2 = go.Scatterpolar(
    r = not_wf,
    theta = variables,
    fill = 'toself',
    name = "Not Waterfront"
)

data = [trace1,trace2]

layout = go.Layout(
    title = "How Does the Median Waterfront House Compare to the Median House Sold?<br>Median House Has A Value of 1 For Each Attribute",
    polar = dict(
        radialaxis = dict(
            visible = True,
            range = [0, 4],
            tick0=0,
            dtick=1
        )
    ),
    showlegend = True
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "radar")

So houses on the waterfront tend to have 3x the price of non-waterfront houses and twice the sqft, but no difference in number of bedrooms or grade or condition.

Bokeh

In [39]:
output_notebook()
Loading BokehJS ...
In [34]:
# Styling for a plot
def style(p):
    # Title 
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.title.text_font = 'serif'

    # Axis titles
    p.xaxis.axis_label_text_font_size = '14pt'
    p.xaxis.axis_label_text_font_style = 'bold'
    p.yaxis.axis_label_text_font_size = '14pt'
    p.yaxis.axis_label_text_font_style = 'bold'

    # Tick labels
    p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.major_label_text_font_size = '12pt'

    return p

Histogram of Prices

In [41]:
""""""
temp_df = df['price']/1000
hist, edges = np.histogram(temp_df, 
                               bins = 40, 
                               range = [0, 1500])
# Put the information in a dataframe
prices = pd.DataFrame({'houses': hist, 
                       'left': edges[:-1], 
                       'right': edges[1:]})
# Add a column showing the extent of each interval
prices['interval'] = ['%s - %s' % (int(round(left,0)), int(round(right,0))) for left, right in zip(prices['left'], prices['right'])]
In [57]:
# Convert dataframe to column data source
src = ColumnDataSource(prices)

# Create the blank plot
p = bplt.figure(plot_height = 600, plot_width = 600, 
           title = 'Histogram of House Prices (Capped at $1.5m)',
          x_axis_label = 'Price (in thousands of dollars)', 
           y_axis_label = 'Number of Houses')

# Add a quad glyph
p.quad(source = src, bottom=0, top='houses', left='left', right='right', fill_color='green', line_color='black',
       hover_fill_alpha = 1.0, hover_fill_color = 'red')

# Hover tool referring to our own data field using @ and
# a position on the graph using $
h = HoverTool(tooltips = [('Price Range ($k)', '@interval'),('Number of Houses Sold', '@houses')])

# Add the hover tool to the graph
p.add_tools(h)

p.xaxis.formatter=NumeralTickFormatter(format="00")

style(p)
# Specify the output file and save
#output_file('hist.html')
# Show the plot
show(p)

Hover over for information about bar.

Interactive Histogram Split by Another Variable

In [58]:
def make_dataset(value_list=np.sort(df['view'].unique()).tolist(), 
                 range_start = 0, range_end = 1500, bin_width = 200, value='view'):
    # Check to make sure the start is less than the end!
    assert range_start < range_end, "Start must be less than end!"
    
    by_value = pd.DataFrame(columns=['proportion', 'left', 'right', 
                                       'h_proportion', 'interval',
                                       'name', 'color'])
    range_extent = range_end - range_start
    
    # Ignore dividing by 0
    #np.seterr(divide='ignore', invalid='ignore')
    
    # Iterate through all the carriers
    for i, value_name in enumerate(value_list):

        # Subset to the carrier
        subset = df['price'][df[value] == value_name]/1000

        # Create a histogram with specified bins and range
        hist, edges = np.histogram(subset, bins = int(range_extent / bin_width), range = [range_start, range_end])

        # Divide the counts by the total to get a proportion and create df
        prices = pd.DataFrame({'proportion': hist / np.sum(hist), 
                               'left': edges[:-1], 'right': edges[1:] })

        # Format the proportion 
        prices['h_proportion'] = ['%0.5f' % proportion for proportion in prices['proportion']]

        # Format the interval
        prices['interval'] = ['%s - %s' % (int(round(left,0)), int(round(right,0))) for left, right in zip(prices['left'], prices['right'])]

        # Assign the carrier for labels
        prices['name'] = value_name

        # Color each carrier differently
        prices['color'] = Category20_16[i]

        # Add to the overall dataframe
        by_value = by_value.append(prices)

    # Overall dataframe
    by_value = by_value.sort_values(['name', 'left'])
    
    # Convert dataframe to column data source
    return ColumnDataSource(by_value) #by_value.to_dict('list')
In [59]:
def make_plot(src, value):
        # Blank plot with correct labels
        p = bplt.figure(plot_width = 700, plot_height = 700, 
                  title = 'Histogram of House Prices by '+value.title(),
                  x_axis_label = 'Price ($K)', y_axis_label = 'Proportion')

        # Quad glyphs to create a histogram
        p.quad(source = src, bottom = 0, top = 'proportion', left = 'left', right = 'right',
               color = 'color', fill_alpha = 0.7, hover_fill_color = 'color', legend = 'name',
               hover_fill_alpha = 1.0, line_color = 'black')

        # Hover tool with vline mode
        h = HoverTool(tooltips=[(value.title(), '@name'), 
                                    ('Price Range ($k)', '@interval'),
                                    ('Proportion', '@h_proportion')],
                          mode='vline')

        p.add_tools(h)

        # Styling
        style(p)

        return p 
In [60]:
# variable='view'
# variable_list=np.sort(df[variable].unique()).tolist()
# src = make_dataset(value_list=variable_list,range_start = 0, range_end = 1500, bin_width = 50, value=variable)
# p = make_plot(src, variable)
# show(p)
In [61]:
def graph(source, colour):
        return p.quad(source = source, bottom = 0, top = 'proportion', left = 'left', right = 'right',
                   color= colour, fill_alpha = 0.7, hover_fill_color = 'color', legend = 'name',
                   hover_fill_alpha = 1.0, line_color = 'black')

callback1 = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
    var kernel = IPython.notebook.kernel;
    cmd = "update_bw(" + cb_obj.value + ")";
    kernel.execute(cmd, {}, {});
}
""") 

def hist_start(title):
    plt = bplt.figure(plot_width = 700, plot_height = 500, 
                      title = 'Histogram of House Prices by '+title.title(),
                      x_axis_label = 'Price ($K)', y_axis_label = 'Proportion')
    style(plt)
    return plt
In [62]:
variable = 'view'
p = hist_start(variable)
variable_list = np.sort(df[variable].unique()).tolist()
args = []
code = "active = cb_obj.active;"
sources = []
rd.shuffle(Category20_20)
for i in range(len(variable_list)):
    sources += [make_dataset(value_list=[variable_list[i]], value=variable)]
    glyph = graph(sources[i], colour=Category20_20[i])
    args += [('glyph'+str(i),glyph)]
    code += "glyph{}.visible = active.includes({});".format(i,i)

def update_bw(bin_wd):
        for i in range(len(variable_list)):
            new_src = make_dataset(value_list=[variable_list[i]], bin_width = bin_wd, value=variable)
            sources[i].data = new_src.data
        push_notebook(handle=bokeh_handle)
    
bin_select = Slider(start = 10, end = 500,
                    step = 10, value = 50,
                    title = 'Width of Price Ranges ($K)',callback=callback1)
checkbox = CheckboxGroup(labels=[str(i) for i in variable_list],\
                         active=[i for i in range(len(variable_list))],\
                         callback = CustomJS(args={key:value for key,value in args},code=code))
bokeh_handle = show(row(p, column(bin_select, checkbox)), notebook_handle=True)

Need to use Bokeh server to use tabs and change variable

In [63]:
variable = 'floors'
p = hist_start(variable)
variable_list = np.sort(df[variable].unique()).tolist()
args = []
code = "active = cb_obj.active;"
sources = []
rd.shuffle(Category20_20)
for i in range(len(variable_list)):
    sources += [make_dataset(value_list=[variable_list[i]], value=variable)]
    glyph = graph(sources[i], colour=Category20_20[i])
    args += [('glyph'+str(i),glyph)]
    code += "glyph{}.visible = active.includes({});".format(i,i)

def update_bw(bin_wd):
        for i in range(len(variable_list)):
            new_src = make_dataset(value_list=[variable_list[i]], bin_width = bin_wd, value=variable)
            sources[i].data = new_src.data
        push_notebook(handle=bokeh_handle)
    
bin_select = Slider(start = 10, end = 500, 
                    step = 10, value = 50,
                    title = 'Width of Price Ranges ($K)',callback=callback1)
checkbox = CheckboxGroup(labels=[str(i) for i in variable_list],\
                         active=[i for i in range(len(variable_list))],\
                         callback = CustomJS(args={key:value for key,value in args},code=code))
bokeh_handle = show(row(p, column(bin_select, checkbox)), notebook_handle=True)

Bar Chart

In [64]:
drop_options=[
    {'label': 'Bedrooms', 'value': 'bedrooms'},
    {'label': 'Bathrooms', 'value': 'bathrooms'},
    {'label': 'Views', 'value': 'view'},
    {'label': 'Floors', 'value': 'floors'},
    {'label': 'Condition', 'value': 'condition'},
    {'label': 'Grade', 'value': 'grade'},
    {'label': 'Waterfront', 'value': 'waterfront'},
    {'label': 'Year Built', 'value': 'yr_built'},
    {'label': 'Year Renovated', 'value': 'yr_renovated'}]

def median_price(variable):
    lab = ''
    for t in range(len(drop_options)):
        if drop_options[t]['value'] == variable:
            lab = drop_options[t]['label']
    if variable == 'yr_renovated':
        labels = np.sort(eval('df.'+variable+'.unique()[1:]'))
    else: 
        labels = np.sort(eval('df.'+variable+'.unique()'))
    values = labels.astype(float)
    for i in range(values.shape[0]):
        values[i] = int(np.median(df['price'][df[variable]==labels[i]])/1000)
    xaxis_title = 'Number of '+ lab.title()
    return([str(l) for l in labels],values.tolist(),xaxis_title)
In [65]:
def bar_chart_bokeh(variable):
    rd.shuffle(Category20_20)
    labs,vals,xtitle = median_price(variable)
    p = bplt.figure(x_range=labs, plot_height=400, plot_width = 700, x_axis_label=xtitle, \
               y_axis_label='Median Price ($K)', title="Median Price vs "+xtitle)
    p.vbar(x=labs, top=vals, width=3.6/len(labs), color=Category20_20[0], line_color='black', \
           hover_fill_alpha = 1.0, hover_fill_color = Category20_20[1])
    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    h = HoverTool(tooltips = [('Median Price ($k)', '@top'),(xtitle, '@x')])
    p.add_tools(h)
    style(p)
    show(p)
In [66]:
bar_chart_bokeh('view')
In [67]:
bar_chart_bokeh('floors')

Bokeh, unlike Dash, is bad for switching between different plots. It requires a changing of data source. It is also difficult to use in Jupyter Notebook, where it needs javascript commands to operate. It is much easier to use in the bokeh server.

Mapping Data

In [68]:
bins = [0,300000,500000,700000,8000000]
df['binned'] = pd.cut(df['price'], bins).astype(str).tolist()
subset = df[['lat','long','price','binned','price_names','month']]
subs={}
for t in range(df['binned'].unique().shape[0]):
    subs["s"+str(t)+""]=subset[subset['binned']==np.sort(df['binned'].unique())[t]]
In [69]:
#GOOGLE_API_KEY = "**************************"

#output_file("gmap.html")

map_options = GMapOptions(lat=np.median(df['lat'])-0.05, lng=np.median(df['long']), map_type="roadmap", zoom=10)

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:p = gmap(GOOGLE_API_KEY, map_options, title="King County")
p = gmap(GOOGLE_API_KEY, map_options, title="2014-15 Sold House Prices in King County")
                                        
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
legend_labels = ['< $300k','$300-500k','$500-700k','> $700k']

sources = []
for i in range(4):
    col = colours[i]
    x = subs['s'+str(i)]
    sources.append(ColumnDataSource(x))
    p.circle(x="long", y="lat", size=4, fill_color=col, line_color=col, fill_alpha=0.8, source=sources[i], legend=legend_labels[i])
    h = HoverTool(tooltips=[('Price', '@price_names')])
    p.add_tools(h)

# Styling
style(p)
p.legend.location = "top_left"
p.legend.click_policy="hide"
show(p)

Animated Map

In [70]:
map_options = GMapOptions(lat=np.median(df['lat'])-0.05, lng=np.median(df['long']), map_type="roadmap", zoom=10)

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:p = gmap(GOOGLE_API_KEY, map_options, title="King County")
p = gmap(GOOGLE_API_KEY, map_options, title="King County House Prices in May 2014")
                                        
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
legend_labels = ['< $300k','$300-500k','$500-700k','> $700k']

sources = []
for i in range(4):
    col = colours[i]
    x = subs['s'+str(i)]
    sources.append(ColumnDataSource(x))
    p.circle(x="long", y="lat", size=4, fill_color=col, line_color=col, fill_alpha=0.8, source=sources[i], legend=legend_labels[i])
    h = HoverTool(tooltips=[('Price', '@price_names')])
    p.add_tools(h)

# Styling
style(p)
p.legend.location = "top_left"
p.legend.click_policy="hide"

Dash is definitely smoother with maps (zooming in and out). But Bohek allows you to hide plots easily.

In [71]:
def update_map(month):
        for i in range(4):
            col = colours[i]
            x = subs['s'+str(i)][subs['s'+str(i)]['month'] == months[month]]
            sources[i].data = ColumnDataSource(x).data
        p.title.text = "King County House Prices in "+months[month][:4].title()+'20'+months[month][4:].title()
        push_notebook(handle=map_handle)
    
callback_map = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
    var kernel = IPython.notebook.kernel;
    cmd = "update_map(" + cb_obj.value + ")";
    kernel.execute(cmd, {}, {});
}
""") 

month_slider = Slider(start = 0, end = 12,
                    step = 1, value = 0,
                    title = 'Months from May 2014',callback=callback_map)
In [40]:
callback_animate = CustomJS(code="""
        var f = cb_obj.active;
        var j = 0;

        if(f == true){
            mytimer = setInterval(replace_data, 500);             
        } else {
            clearInterval(mytimer);
        }

        function replace_data() {
             j++;
             if(j>12){
                 j = 0;
             }
             var kernel = IPython.notebook.kernel;
             cmd = "update_map(" + j + ")";
             kernel.execute(cmd, {}, {});
        }
""")

btn = Toggle(label="Play/Stop Animation of All Months", button_type="success",
             active=False, callback=callback_animate)

map_handle = show(row(p, column(btn,month_slider)), notebook_handle=True)

Ridge Plot

In [72]:
def ridge(category, data, scale=300):
    return list(zip([category]*len(data), scale*data))

def ridge_plot(variable):
    cats = np.sort(df[variable].unique()).tolist()
    labels = [str(i) for i in cats]
    palette = [cc.rainbow[i*15*int(17/len(cats))] for i in range(len(cats))]
    
    if variable == 'view':
        ylabel = 'Number of '+variable.title()+'s'
    else:
        ylabel = 'Number of '+variable.title()
    x = np.linspace(0,1500, 500)
    source = ColumnDataSource(data=dict(x=x))
    p = bplt.figure(y_range=labels, plot_width=900, x_range=(0, 1500), toolbar_location=None, title="Ridge Plot for "+variable.title(),
               x_axis_label = 'Price ($K)', y_axis_label = ylabel)

    for i, cat in enumerate(reversed(labels)):
        pdf = gaussian_kde((df['price'][df[variable]==cats[len(labels)-1-i]]/1000).astype(int))
        y = ridge(cat, pdf(x))
        source.add(y, cat)
        p.patch('x', cat, color=palette[i], alpha=0.6, line_color="black", source=source)

    p.outline_line_color = None
    p.background_fill_color = "#efefef"

    p.ygrid.grid_line_color = None
    p.xgrid.grid_line_color = "#dddddd"
    p.xgrid.ticker = p.xaxis[0].ticker

    p.axis.minor_tick_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.axis_line_color = None

    p.y_range.range_padding = 0.12

    style(p)
    show(p)
In [73]:
ridge_plot('view')
In [74]:
ridge_plot('floors')

Multi-Axis Plot

In [75]:
monthly_median_prices = []
monthly_volume = []
for m in range(len(months)):
    temp_df = df[df['month']==months[m]]
    monthly_median_prices.append(int(np.median(temp_df['price'])))
    monthly_volume.append(temp_df.shape[0])
In [76]:
source_pd = pd.DataFrame(
    {'months': months,
     'monthly_median_prices': [i/1000 for i in monthly_median_prices],
     'monthly_volume': monthly_volume
    })
source = ColumnDataSource(source_pd)
In [77]:
p = bplt.figure(plot_width=900, x_range = months,y_range=(min(monthly_median_prices)*0.000995,max(monthly_median_prices)*0.00105),\
           title="2014-15 King County Real Estate Market Activity",\
           x_axis_label = 'Month', y_axis_label = "Median Price ($K)")

p.line('months', 'monthly_median_prices', line_width=2, color='blue', legend="Median Price Sold", source=source)
p.extra_y_ranges = {"second_axis": Range1d(start=min(monthly_volume)*0.95, end=max(monthly_volume)*1.05)}
p.add_layout(LinearAxis(y_range_name="second_axis", axis_label="Number of Houses Sold"), 'right')
p.line('months', 'monthly_volume', line_width=2, y_range_name="second_axis", color='red', legend="Number of Houses Sold", source=source)
h = HoverTool(tooltips=[('Month', '@months'),('Median Price ($K)', '@monthly_median_prices'),('Number of Houses Sold', '@monthly_volume')])
p.add_tools(h)
p.legend.location = "bottom_left"
p.legend.click_policy="hide"
style(p)
show(p)

In Dash it automatically sets the hover tool.

Animated Scatterplot

In [78]:
bins = [100000*i for i in range(0,11)]+[8000000]
df['binned2'] = pd.cut(df['price'], bins).astype(str).tolist()
bins = ['(0, 100000]', '(100000, 200000]', '(200000, 300000]', '(300000, 400000]',\
        '(400000, 500000]', '(500000, 600000]', '(600000, 700000]',\
        '(700000, 800000]', '(800000, 900000]', '(900000, 1000000]', '(1000000, 8000000]',]
palette = [cc.rainbow[(len(bins)-1-i)*15*int(17/(len(bins)))] for i in range(len(bins))]
df['color'] = palette[0]
col_conv = dict(zip(bins,palette))

min_price = min(df['price'])
max_price = max(df['price'])
sizes = [5*i+5 for i in range(1,len(bins)+1)]
df['size'] = sizes[0]
size_conv = dict(zip(bins,sizes))

for i in df.index.values:
    df.loc[i,'color'] = col_conv[df.loc[i,'binned2']]
    df.loc[i,'size'] = size_conv[df.loc[i,'binned2']]
In [79]:
scatter_df = df[['price','bedrooms','sqft_living15','max_yr_built_or_renovated','color', 'size', 'price_names']][(df['yr_renovated']>0)\
                                                                                               &(df['bedrooms']<33)]
In [80]:
temp_df = scatter_df[scatter_df['max_yr_built_or_renovated']==2010]
source = ColumnDataSource(temp_df)

p = bplt.figure(plot_width=900, plot_height=350, title="Prices of Houses Built or Last Renovated in 2010",\
           x_axis_label = 'Living Space', y_axis_label = "Number of Bedrooms",\
           y_range = (0,max(scatter_df['bedrooms'])*1.05),\
           x_range = (min(scatter_df['sqft_living15'])*0.95,max(scatter_df['sqft_living15'])*1.05))
p.scatter(x='sqft_living15', y='bedrooms', color='color', size='size', source=source, alpha=0.8)
style(p)
h = HoverTool(tooltips=[('Price', '@price_names'),('Number of Bedrooms', '@bedrooms'),('Size of Living Space', '@sqft_living15')])
p.add_tools(h)

callback_scatter = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
    var kernel = IPython.notebook.kernel;
    cmd = "update_scatter(" + cb_obj.value + ")";
    kernel.execute(cmd, {}, {});
}
""") 

def update_scatter(year):
        temp_df = scatter_df[scatter_df['max_yr_built_or_renovated']==year]
        source.data = ColumnDataSource(temp_df).data
        p.title.text = "Prices of Houses Built or Last Renovated in " + str(year)
        push_notebook(handle=scatter_handle)
    
year_select = Slider(start = min(scatter_df['max_yr_built_or_renovated']), end = max(scatter_df['max_yr_built_or_renovated']),
                    step = 1, value = 2010,
                    title = 'Year',callback=callback_scatter)
In [81]:
callback_animate = CustomJS(code="""
        var f = cb_obj.active;
        var j = 1965;

        if(f == true){
            mytimer = setInterval(replace_data, 250);             
        } else {
            clearInterval(mytimer);
        }

        function replace_data() {
             j++;
             if(j>2015){
                 j = 1965;
             }
             var kernel = IPython.notebook.kernel;
             cmd = "update_scatter(" + j + ")";
             kernel.execute(cmd, {}, {});
        }
""")

btn = Toggle(label="Play/Stop Animation (from 1965)", button_type="success",
             active=False, callback=callback_animate)

scatter_handle = show(column(btn,year_select,p), notebook_handle=True)

Modelling the Data in Keras

In [82]:
df['year'] = pd.to_numeric(df.date.str.slice(0, 4))
df['month'] = pd.to_numeric(df.date.str.slice(4, 6))
df['day'] = pd.to_numeric(df.date.str.slice(6, 8))

cols = ['year','month','day','yr_renovated',
        'bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront',
        'condition','grade','sqft_above','sqft_basement','yr_built',
        'zipcode','lat','long','sqft_living15','sqft_lot15','price']

model_data = pd.DataFrame(df, columns=cols)
# no id, date, year built or year renovated

Zipcode, Long and Lat might not be good to treat as numbers linear

65%, 80-65=15%, 100-65-15=20% split

In [83]:
train, validate, test = np.split(model_data.sample(frac=1), [int(.65*len(model_data)), int(.80*len(model_data))])
In [84]:
def exclude_price(df):
    return(df.columns[~df.columns.isin(['price'])])

wo_price = exclude_price(train)
x_train, x_validate, x_test = train[wo_price], validate[wo_price], test[wo_price]
y_train, y_validate, y_test = train['price'], validate['price'], test['price']
In [85]:
print('Number of columns in data: ', x_train.shape[1]-1)
print('Size of training set: ', x_train.shape[0])
print('Size of validation set: ', x_validate.shape[0])
print('Size of test set: ', test.shape[0])
Number of columns in data:  19
Size of training set:  14048
Size of validation set:  3242
Size of test set:  4323
In [86]:
print("Number of rows with nulls is %s" %  train[train.isnull().any(axis=1)].shape[0])
Number of rows with nulls is 0

Could Normalise using MinMax

In [87]:
min_max_scaler = MinMaxScaler()

x_train_mnmx = pd.DataFrame(min_max_scaler.fit_transform(x_train.values))
x_validate_mnmx = pd.DataFrame(min_max_scaler.fit_transform(x_validate.values))
x_test_mnmx = pd.DataFrame(min_max_scaler.fit_transform(x_test.values))
corr_df = pd.DataFrame(min_max_scaler.fit_transform(model_data.values))
y_train_mnmx = pd.DataFrame(min_max_scaler.fit_transform(y_train.values.reshape(-1, 1)))
y_validate_mnmx = pd.DataFrame(min_max_scaler.fit_transform(y_validate.values.reshape(-1, 1)))

Or Could Use Mean and Std Dev to Turn Into Z-Score

In [88]:
means = x_train.append(x_validate, ignore_index=True).mean(axis=0)
stds = x_train.append(x_validate, ignore_index=True).std(axis=0)
x_train_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:
    x_train_zscore[c] = (x_train[c]-means[c])/stds[c]
x_validate_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:
    x_validate_zscore[c] = (x_validate[c]-means[c])/stds[c]

ymeans = y_train.append(y_validate, ignore_index=True).mean(axis=0)
ystds = y_train.append(y_validate, ignore_index=True).std(axis=0)
y_train_zscore = pd.DataFrame()
y_train_zscore = (y_train-ymeans)/ystds
y_validate_zscore = pd.DataFrame()
y_validate_zscore = (y_validate-ymeans)/ystds
In [89]:
f, ax = plt.subplots(figsize=(10, 8))
corr = pd.DataFrame(corr_df).corr()
corr.columns = cols
corr.index = cols
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x20dbec806a0>
In [90]:
corr.style.background_gradient().set_precision(2)
Out[90]:
year month day yr_renovated bedrooms bathrooms sqft_living sqft_lot floors waterfront condition grade sqft_above sqft_basement yr_built zipcode lat long sqft_living15 sqft_lot15 price
year 1 -0.78 -0.0061 -0.024 -0.0098 -0.027 -0.029 0.0055 -0.022 -0.0042 -0.046 -0.03 -0.024 -0.016 0.0035 0.0012 -0.029 0.00027 -0.022 -8.5e-05 0.0036
month -0.78 1 -0.061 0.013 -0.0015 0.0074 0.012 -0.0024 0.014 0.0081 0.022 0.0084 0.0099 0.006 -0.0062 -0.00017 0.015 -0.0081 0.0024 0.0035 -0.01
day -0.0061 -0.061 1 0.0081 -0.008 -0.0053 -0.0072 0.00063 -0.0067 0.011 -0.0046 -0.012 -0.0025 -0.01 -0.0058 -0.0033 -0.016 -0.0073 -0.0085 -0.0029 -0.015
yr_renovated -0.024 0.013 0.0081 1 0.019 0.051 0.055 0.0076 0.0063 0.093 -0.061 0.014 0.023 0.071 -0.22 0.064 0.029 -0.068 -0.0027 0.0079 0.13
bedrooms -0.0098 -0.0015 -0.008 0.019 1 0.52 0.58 0.032 0.18 -0.0066 0.028 0.36 0.48 0.3 0.15 -0.15 -0.0089 0.13 0.39 0.029 0.31
bathrooms -0.027 0.0074 -0.0053 0.051 0.52 1 0.75 0.088 0.5 0.064 -0.12 0.66 0.69 0.28 0.51 -0.2 0.025 0.22 0.57 0.087 0.53
sqft_living -0.029 0.012 -0.0072 0.055 0.58 0.75 1 0.17 0.35 0.1 -0.059 0.76 0.88 0.44 0.32 -0.2 0.053 0.24 0.76 0.18 0.7
sqft_lot 0.0055 -0.0024 0.00063 0.0076 0.032 0.088 0.17 1 -0.0052 0.022 -0.009 0.11 0.18 0.015 0.053 -0.13 -0.086 0.23 0.14 0.72 0.09
floors -0.022 0.014 -0.0067 0.0063 0.18 0.5 0.35 -0.0052 1 0.024 -0.26 0.46 0.52 -0.25 0.49 -0.059 0.05 0.13 0.28 -0.011 0.26
waterfront -0.0042 0.0081 0.011 0.093 -0.0066 0.064 0.1 0.022 0.024 1 0.017 0.083 0.072 0.081 -0.026 0.03 -0.014 -0.042 0.086 0.031 0.27
condition -0.046 0.022 -0.0046 -0.061 0.028 -0.12 -0.059 -0.009 -0.26 0.017 1 -0.14 -0.16 0.17 -0.36 0.003 -0.015 -0.11 -0.093 -0.0034 0.036
grade -0.03 0.0084 -0.012 0.014 0.36 0.66 0.76 0.11 0.46 0.083 -0.14 1 0.76 0.17 0.45 -0.18 0.11 0.2 0.71 0.12 0.67
sqft_above -0.024 0.0099 -0.0025 0.023 0.48 0.69 0.88 0.18 0.52 0.072 -0.16 0.76 1 -0.052 0.42 -0.26 -0.00082 0.34 0.73 0.19 0.61
sqft_basement -0.016 0.006 -0.01 0.071 0.3 0.28 0.44 0.015 -0.25 0.081 0.17 0.17 -0.052 1 -0.13 0.075 0.11 -0.14 0.2 0.017 0.32
yr_built 0.0035 -0.0062 -0.0058 -0.22 0.15 0.51 0.32 0.053 0.49 -0.026 -0.36 0.45 0.42 -0.13 1 -0.35 -0.15 0.41 0.33 0.071 0.054
zipcode 0.0012 -0.00017 -0.0033 0.064 -0.15 -0.2 -0.2 -0.13 -0.059 0.03 0.003 -0.18 -0.26 0.075 -0.35 1 0.27 -0.56 -0.28 -0.15 -0.053
lat -0.029 0.015 -0.016 0.029 -0.0089 0.025 0.053 -0.086 0.05 -0.014 -0.015 0.11 -0.00082 0.11 -0.15 0.27 1 -0.14 0.049 -0.086 0.31
long 0.00027 -0.0081 -0.0073 -0.068 0.13 0.22 0.24 0.23 0.13 -0.042 -0.11 0.2 0.34 -0.14 0.41 -0.56 -0.14 1 0.33 0.25 0.022
sqft_living15 -0.022 0.0024 -0.0085 -0.0027 0.39 0.57 0.76 0.14 0.28 0.086 -0.093 0.71 0.73 0.2 0.33 -0.28 0.049 0.33 1 0.18 0.59
sqft_lot15 -8.5e-05 0.0035 -0.0029 0.0079 0.029 0.087 0.18 0.72 -0.011 0.031 -0.0034 0.12 0.19 0.017 0.071 -0.15 -0.086 0.25 0.18 1 0.082
price 0.0036 -0.01 -0.015 0.13 0.31 0.53 0.7 0.09 0.26 0.27 0.036 0.67 0.61 0.32 0.054 -0.053 0.31 0.022 0.59 0.082 1

By correlation, which measures linear relationships, a parsimonious dataset would contain bedrooms, bathrooms, sqft_lving, grade, condition.

In [91]:
small_data_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'grade', 'condition']
sdindex = [cols.index(i) for i in small_data_cols]
small_x_train, small_x_validate, small_x_test = x_train_mnmx[sdindex], x_validate_mnmx[sdindex], x_test_mnmx[sdindex]

Clearly year, month and day are not correlated with the others. But the area ones are. Nothing special though.

In [98]:
def model1(x_size, y_size):
    model = Sequential()
    model.add(Dense(50, activation="tanh", input_shape=(x_size,)))
    model.add(Dense(25, activation="relu"))
    model.add(Dense(y_size))
    print(model.summary())
    model.compile(loss='mean_squared_error',optimizer=RMSprop(),metrics=[metrics.mae])
    return(model)

Use small dataset for model 2

In [99]:
def model2(x_size, y_size):
    model = Sequential()
    model.add(Dense(100, activation="relu", input_shape=(x_size,)))
    BatchNormalization(),
    model.add(Dropout(0.1))
    model.add(Dense(50, activation="relu"))
    BatchNormalization(),
    model.add(Dense(20, activation="sigmoid"))
    BatchNormalization(),
    model.add(Dense(y_size))
    print(model.summary())
    model.compile(loss='mean_absolute_error',optimizer=Adam(),metrics=[metrics.mae])
    return(model)
In [100]:
def model3(x_size, y_size):
    model = Sequential()
    model.add(Dense(80, activation="tanh", kernel_initializer='normal', input_shape=(x_size,)))
    model.add(Dropout(0.2))
    model.add(Dense(120, activation="relu", kernel_initializer='normal', 
        kernel_regularizer=regularizers.l1(0.01), bias_regularizer=regularizers.l1(0.01)))
    model.add(Dropout(0.1))
    model.add(Dense(20, activation="relu", kernel_initializer='normal', 
        kernel_regularizer=regularizers.l1_l2(0.01), bias_regularizer=regularizers.l1_l2(0.01)))
    model.add(Dropout(0.1))
    model.add(Dense(10, activation="relu", kernel_initializer='normal'))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error',optimizer='nadam',metrics=[metrics.mae])
    return(model)

Now we create the model - use one of the above functions.

In [101]:
m1 = model1(x_train.shape[1], 1)
m1.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_8 (Dense)              (None, 50)                1050      
_________________________________________________________________
dense_9 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 26        
=================================================================
Total params: 2,351
Trainable params: 2,351
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_8 (Dense)              (None, 50)                1050      
_________________________________________________________________
dense_9 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 26        
=================================================================
Total params: 2,351
Trainable params: 2,351
Non-trainable params: 0
_________________________________________________________________
In [102]:
m2 = model2(small_x_train.shape[1], 1)
m2.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_11 (Dense)             (None, 100)               600       
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_13 (Dense)             (None, 20)                1020      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 21        
=================================================================
Total params: 6,691
Trainable params: 6,691
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_11 (Dense)             (None, 100)               600       
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_13 (Dense)             (None, 20)                1020      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 21        
=================================================================
Total params: 6,691
Trainable params: 6,691
Non-trainable params: 0
_________________________________________________________________
In [103]:
m3 = model3(x_train.shape[1], 1)
m3.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_15 (Dense)             (None, 80)                1680      
_________________________________________________________________
dropout_3 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 120)               9720      
_________________________________________________________________
dropout_4 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 20)                2420      
_________________________________________________________________
dropout_5 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 11        
=================================================================
Total params: 14,041
Trainable params: 14,041
Non-trainable params: 0
_________________________________________________________________
In [104]:
m4 = model3(x_train.shape[1], 1)
m4.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_20 (Dense)             (None, 80)                1680      
_________________________________________________________________
dropout_6 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 120)               9720      
_________________________________________________________________
dropout_7 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 20)                2420      
_________________________________________________________________
dropout_8 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 11        
=================================================================
Total params: 14,041
Trainable params: 14,041
Non-trainable params: 0
_________________________________________________________________
In [105]:
epochs = 500
batch_size = 128
print('Epochs: ', epochs)
print('Batch size: ', batch_size)
Epochs:  500
Batch size:  128
In [106]:
history1 = m1.fit(x_train_mnmx.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0, 
                  validation_data=(x_validate_mnmx.values, y_validate.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
In [107]:
history2 = m2.fit(small_x_train.values, y_train.values, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=0,
                  validation_data=(small_x_validate.values, y_validate.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
In [108]:
history3 = m3.fit(x_train_mnmx.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
                  validation_data=(x_validate_mnmx.values, y_validate.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
In [109]:
history4 = m4.fit(x_train_zscore.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
                  validation_data=(x_validate_zscore.values, y_validate.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
In [110]:
training_score = m1.evaluate(x_train_mnmx, y_train, verbose=0)
validation_score = m1.evaluate(x_validate_mnmx, y_validate, verbose=0)
print('Model 1')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
      %(int(round(training_score[1]/1000, 0)),int(round(training_score[0]/1000000000, 0)), 
        int(round(validation_score[1]/1000, 0)),int(round(validation_score[0]/1000000000, 0))))

training_score = m2.evaluate(small_x_train, y_train, verbose=0)
validation_score = m2.evaluate(small_x_validate, y_validate, verbose=0)
print('Model 2')
print('Training MAE: $%sk\nTraining (Squared) Loss: >= $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: >= $%sbn\n'\
      %(int(round(training_score[1]/1000, 0)),int(round(pow(training_score[0],2)/1000000000, 0)), 
        int(round(validation_score[1]/1000, 0)),int(round(pow(validation_score[0],2)/1000000000, 0))))

training_score = m3.evaluate(x_train_mnmx, y_train, verbose=0)
validation_score = m3.evaluate(x_validate_mnmx, y_validate, verbose=0)
print('Model 3')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
      %(int(round(training_score[1]/1000, 0)),int(round(pow(training_score[0],2)/1000000000, 0)), 
        int(round(validation_score[1]/1000, 0)),int(round(pow(validation_score[0],2)/1000000000, 0))))

training_score = m4.evaluate(x_train_zscore, y_train, verbose=0)
validation_score = m4.evaluate(x_validate_zscore, y_validate, verbose=0)
print('Model 4')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
      %(int(round(training_score[1]/1000, 0)),int(round(pow(training_score[0],2)/1000000000, 0)), 
        int(round(validation_score[1]/1000, 0)),int(round(pow(validation_score[0],2)/1000000000, 0))))
Model 1
Training MAE: $227k
Training (Squared) Loss: $132bn
Validation MAE: $229k
Validation (Squared) Loss: $147bn

Model 2
Training MAE: $539k
Training (Squared) Loss: >= $291bn
Validation MAE: $539k
Validation (Squared) Loss: >= $291bn

Model 3
Training MAE: $110k
Training (Squared) Loss: $983010550356bn
Validation MAE: $200k
Validation (Squared) Loss: $8831463880517bn

Model 4
Training MAE: $64k
Training (Squared) Loss: $97836282049bn
Validation MAE: $73k
Validation (Squared) Loss: $506177185210bn

In [111]:
p = bplt.figure(plot_width=900,title="Model 4 Performance",
           x_axis_label = 'Epoch', y_axis_label = "Mean Absolute Error ($k)")

perf = pd.DataFrame(
    {'training': [i/1000 for i in history4.history['mean_absolute_error']],
     'validation': [i/1000 for i in history4.history['val_mean_absolute_error']],
     'epoch': [i for i in range(len(history4.history['val_mean_absolute_error']))],
    })
source_perf = ColumnDataSource(perf)

p.line('epoch','training',line_width=2, color='purple', legend="Training", source=source_perf)
p.line('epoch','validation',line_width=2, color='pink', legend="Validation", source=source_perf)
h = HoverTool(tooltips=[('Train MAE ($k)', '@training'),('Validation MAE ($k)', '@validation'),('Epoch', '@epoch')])
p.add_tools(h)
p.legend.location = "top_right"
p.legend.click_policy="hide"
style(p)
show(p)

Train Model 4 again with both the training set and the validation set. Then use compare its predictions on the test set with the actual values.

In [112]:
test_means = x_test.mean(axis=0)
test_stds = x_test.std(axis=0)
x_test_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:
    x_test_zscore[c] = (x_test[c]-test_means[c])/test_stds[c]
In [113]:
m5 = model3(x_train.shape[1], 1)
new_x_train = x_train_zscore.append(x_validate_zscore, ignore_index=True)
new_y_train = y_train.append(y_validate, ignore_index=True)
history5 = m5.fit(new_x_train.values, new_y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
                  validation_data=(x_test_zscore.values, y_test.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=40, verbose=0)])
In [114]:
p = bplt.figure(plot_width=900,title="Model 4 Performance Using Validation and Training Data",
           x_axis_label = 'Epoch', y_axis_label = "Mean Absolute Error ($k)")

perf2 = pd.DataFrame(
    {'training': [i/1000 for i in history5.history['mean_absolute_error']],
     'validation': [i/1000 for i in history5.history['val_mean_absolute_error']],
     'epoch': [i for i in range(len(history5.history['val_mean_absolute_error']))],
    })
source_perf2 = ColumnDataSource(perf2)

p.line('epoch','training',line_width=2, color='purple', legend="Training", source=source_perf2)
p.line('epoch','validation',line_width=2, color='blue', legend="Test", source=source_perf2)
h = HoverTool(tooltips=[('Train MAE ($k)', '@training'),('Test MAE ($k)', '@validation'),('Epoch', '@epoch')])
p.add_tools(h)
p.legend.location = "top_right"
p.legend.click_policy="hide"
style(p)
show(p)
In [115]:
training_score = m5.evaluate(new_x_train, new_y_train, verbose=0)
validation_score = m5.evaluate(x_test_zscore, y_test, verbose=0)
print('Model 4 Trained on Training and Validation Datasets')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
      %(int(round(training_score[1]/1000, 0)),int(round(pow(training_score[0],2)/1000000000, 0)), 
        int(round(validation_score[1]/1000, 0)),int(round(pow(validation_score[0],2)/1000000000, 0))))
Model 4 Trained on Training and Validation Datasets
Training MAE: $63k
Training (Squared) Loss: $90586621929bn
Validation MAE: $74k
Validation (Squared) Loss: $278769806569bn

In [ ]:
test_y_predictions = m5.predict(x_test_zscore)
pred_pd = pd.DataFrame(
    {'predicted': [log(i[0]/1000) for i in test_y_predictions.tolist()],
     'actual': [log(i/1000) for i in y_test],
     'predicted_nl': [int(i[0]/1000) for i in test_y_predictions.tolist()],
     'actual_nl': [int(i/1000) for i in y_test]
    })
mn = min(min(pred_pd['predicted']),min(pred_pd['actual']))
mx = max(max(pred_pd['predicted']),max(pred_pd['actual']))
pred_line = pd.DataFrame(
    {'x': [mn,mx],
     'y': [mn,mx]
    })
source_pred = ColumnDataSource(pred_pd)
source_line = ColumnDataSource(pred_line)
p = bplt.figure(plot_width=900,title="Actual House Prices vs Predicted House Prices",\
           x_axis_label = "Log of Predicted", y_axis_label = 'Log of Actual')
p.scatter(x='predicted', y='actual', source=source_pred)
h = HoverTool(tooltips=[('Actual Price ($k)', '@actual_nl'),('Predicted Price ($k)', '@predicted_nl')])
p.add_tools(h)
p.line(x='x',y='y',line_width=1, line_dash='dashed', color='red', source=source_line)
p.legend.location = "bottom_right"
style(p)
show(p)
In [ ]:
with open('model4_predictions.pckl', 'wb') as handle:
    pickle.dump(test_y_predictions, handle)
handle.close()
with open('model4_actual.pckl', 'wb') as handle:
    pickle.dump(y_test, handle)
handle.close()
with open('model4_actual_extended.pckl', 'wb') as handle:
    pickle.dump(test, handle)
handle.close()